library(data.table)
library(ggplot2)
library(viridis)
library(scales)
library(tidyverse)
library(ape)
library(igraph)
library(ggExtra)

##########################################
# Reading in the data.                   #
##########################################

# Shared colour palette used by the figures in this script.
c_green <- "#009E73"
c_blue  <- "#0072B2"
c_red   <- "#D55E00"
c_black <- "#000000"

# Lower-case a document name after dropping its last four characters.
strip_last4 <- function(doc_name) {
  tolower(substr(doc_name, 1, nchar(doc_name) - 4))
}

# Pairs for which reuse was found. Release columns become Date objects and
# the From/In document names are normalised with strip_last4().
reused <- fread("Reuse_Found.csv")
reused$From_Release <- as.Date(reused$From_Release)
reused$In_Release   <- as.Date(reused$In_Release)
reused$From <- strip_last4(reused$From)
reused$In   <- strip_last4(reused$In)

# Every comparison that was run (includes the meta data columns).
combined_meta <- fread("Combined_Meta.csv")
combined_meta$From_Release <- as.Date(combined_meta$From_Release)
combined_meta$In_Release   <- as.Date(combined_meta$In_Release)

# First seen dates. (Combined first price point on CMC and first date from
# ICOBench.)
firstseen <- read.csv("FirstSeenDate.csv")
firstseen$Release <- as.Date(firstseen$Release)

# ICO list; its columns are renamed to Rank and Coin.
icocoins <- setNames(
  read.csv("ICOList.csv", stringsAsFactors = FALSE),
  c("Rank", "Coin")
)

# ICO data parsed from ICOBench: keep columns 1, 7, 8 and 11, lower-case the
# `ico` column, then rename the four columns.
icobench <- read.csv("ICObench_data.csv")[, c(1, 7, 8, 11)]
icobench$ico <- tolower(icobench$ico)
names(icobench) <- c("Coin", "Rating", "BenchyRating", "Country")


##########################################
# Plotting the CDF of reuse by method.   #
##########################################

# Base-graphics CDF restricted to the reused pairs. (NOT USED IN PAPER)
# The N2 curve opens the plot; the other methods are overlaid on top of it.
plot(ecdf(reused$N2), verticals = TRUE, do.points = FALSE)
overlay_colours <- c(N3 = "brown", N4 = "blue", TFIDF = "orange")
for (method in names(overlay_colours)) {
  plot(ecdf(reused[[method]]), verticals = TRUE, do.points = FALSE,
       add = TRUE, col = overlay_colours[[method]])
}



# WARNING
# This takes a long time to complete, the data frame it is using is huge.

# Stack the four similarity columns into one long column; `group` (1..4)
# records which method each value came from, in the order N2, N3, N4, TFIDF.
n_comparisons <- nrow(combined_meta)
combined_factored <- data.frame(
  vals = c(combined_meta$N2, combined_meta$N3,
           combined_meta$N4, combined_meta$TFIDF),
  group = gl(4, n_comparisons)
)

# Log-10 x axis with breaks and labels shown as powers of ten.
log10_breaks <- trans_breaks("log10", function(x) 10^x)
log10_labels <- trans_format("log10", math_format(10^.x))

cdf_theme <- theme_bw() +
  theme(
    axis.text = element_text(size = 15),
    axis.title = element_text(size = 18, face = "bold"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    legend.key.size = unit(1, "cm"),
    legend.text = element_text(size = 16, face = "bold"),
    legend.title = element_text(size = 18)
  )

# x is shifted by +1 (after scaling to percent) so zero overlap can be drawn
# on the log axis.
p <- ggplot(combined_factored, aes(x = vals * 100 + 1, colour = group)) +
  stat_ecdf(geom = "step", size = 1) +
  labs(
    title = "CDF of Overlap by Method",
    y = "F(n)",
    x = "Text Reuse (%)",
    colour = "Method"
  ) +
  scale_x_log10(breaks = log10_breaks, labels = log10_labels) +
  scale_color_manual(
    labels = c("N2", "N3", "N4", "TFIDF"),
    values = c(c_green, c_blue, c_red, c_black)
  ) +
  cdf_theme

p



##########################################
# Binning the data.                      #
##########################################

# Rows kept for both subsets: the copying document has no rank, or the source
# document has a release date. Only the first ten columns are retained.
pair_ok <- is.na(reused$In_Rank) | !is.na(reused$From_Release)

# Technical Text Reuse
tech_rows <- which(pair_ok & reused$isTechnical == 'TRUE')
tech_reuse <- reused[tech_rows, 1:10]

# Legal Text Reuse
legal_rows <- which(pair_ok & reused$isLegal == 'TRUE')
legal_reuse <- reused[legal_rows, 1:10]


# Rank bins: (0, 50.5], (50.5, 500.5], (500.5, Inf). The labels carry a
# leading digit so they sort in rank order.
breakpoints <- c(0, 50.5, 500.5, Inf)
bins <- c('1 1-50', '2 51-500', '3 501+')

# Map ranks to their bin label as plain character (NA ranks stay NA).
rank_to_bin <- function(rank) {
  as.character(cut(rank, breaks = breakpoints, labels = bins))
}


# Legal pairs: a missing In rank is labelled 'ICO'.
legal_reuse$From_Bin <- rank_to_bin(legal_reuse$From_Rank)
legal_reuse$In_Bin <- rank_to_bin(legal_reuse$In_Rank)
legal_reuse$In_Bin[is.na(legal_reuse$In_Bin)] <- 'ICO'
legal_table <- table(legal_reuse$In_Bin, legal_reuse$From_Bin)
legal_table

# Technical pairs, binned the same way.
tech_reuse$From_Bin <- rank_to_bin(tech_reuse$From_Rank)
tech_reuse$In_Bin <- rank_to_bin(tech_reuse$In_Rank)
tech_reuse$In_Bin[is.na(tech_reuse$In_Bin)] <- 'ICO'
tech_table <- table(tech_reuse$In_Bin, tech_reuse$From_Bin)
tech_table

# Uniques per bin: number of distinct values in column 1 among the pairs whose
# source falls in each bin.

nrow(unique(tech_reuse[tech_reuse$From_Bin == bins[1], 1])) # Unique from top 50
nrow(unique(tech_reuse[tech_reuse$From_Bin == bins[2], 1])) # Unique from 51-500
nrow(unique(tech_reuse[tech_reuse$From_Bin == bins[3], 1])) # Unique from 501+

nrow(unique(legal_reuse[legal_reuse$From_Bin == bins[1], 1])) # Unique from top 50
nrow(unique(legal_reuse[legal_reuse$From_Bin == bins[2], 1])) # Unique from 51-500
nrow(unique(legal_reuse[legal_reuse$From_Bin == bins[3], 1])) # Unique from 501+




##########################################
# Chi-Squared Tests.                     #
##########################################


# Collapse an In_Bin x From_Bin count table into a 2 x k contingency matrix:
#   row "Active" = column sums over every non-ICO In_Bin row,
#   row "ICO"    = the counts of the 'ICO' In_Bin row.
# The ICO row is selected by name rather than by "last row", so the result
# does not depend on how the row labels happen to sort.
ico_vs_active <- function(bin_table) {
  is_ico <- rownames(bin_table) == "ICO"
  rbind(
    Active = colSums(bin_table[!is_ico, , drop = FALSE]),
    ICO = colSums(bin_table[is_ico, , drop = FALSE])
  )
}

# The two count vectors must be passed to chisq.test() as ONE matrix.
# chisq.test(x, y) with two numeric vectors coerces both to factors and
# cross-tabulates them, which tests a 3 x 3 table of "labels" built from three
# observations instead of the counts themselves.

# Technical Reuse Chisq Test
chisq.test(ico_vs_active(tech_table))

# Legal Reuse Chisq Test
chisq.test(ico_vs_active(legal_table))

# The bin columns were only needed for the tables above; drop them again.
legal_reuse$From_Bin <- NULL
legal_reuse$In_Bin <- NULL

tech_reuse$From_Bin <- NULL
tech_reuse$In_Bin <- NULL




##########################################
# Differencing the first seen times.     #
##########################################

# Whole days between the two release dates of each pair (In minus From).
days_apart <- function(pairs) {
  as.integer(pairs$In_Release - pairs$From_Release)
}

combined_meta$timediff <- days_apart(combined_meta)
tech_reuse$timediff <- days_apart(tech_reuse)
legal_reuse$timediff <- days_apart(legal_reuse)

# CDF of the time differences on a log x axis; +1 keeps a zero-day difference
# drawable on the log scale.
pdf('CDFTimeDiff-log.pdf', width = 7, height = 7)
# plot(ecdf(combined_meta$timediff+1), verticals=TRUE, do.points=FALSE, main = "CDF of Time Difference Between First Seen Dates",
#      xlab = "Time (Days)", lwd = 2, cex.lab = 1.25, cex.axis = 1.2, cex.main=1.5, col = c_blue, log = 'x',
#      xlim = c(1,2278))
# plot(ecdf(c(tech_reuse$timediff, legal_reuse$timediff)), verticals=TRUE, do.points=FALSE, add=TRUE, lwd = 2)
tech_cdf <- ecdf(tech_reuse$timediff + 1)
legal_cdf <- ecdf(legal_reuse$timediff + 1)
plot(
  tech_cdf,
  verticals = TRUE, do.points = FALSE,
  main = "CDF of Time Difference Between First Seen Dates",
  xlab = "Time (Days)",
  lwd = 2, cex.lab = 1.25, cex.axis = 1.2, cex.main = 1.5,
  col = c_red, log = 'x', xlim = c(1, 2278)
)
plot(legal_cdf, verticals = TRUE, do.points = FALSE, add = TRUE,
     col = c_black, lwd = 2)
legend(
  'left',
  legend = c('High Sim.', 'Moderate Sim.'),
  col = c(c_red, c_black),
  lty = 1, lwd = 2
)
# legend('right', legend = c('All Comparisons', 'Text Reuse'), col = c(c_blue, c_black),
#        lty = 1, lwd = 2)
dev.off()

# Summary of the difference between copying dates (technical + legal pairs).
summary(c(tech_reuse$timediff, legal_reuse$timediff))

# Summary of the difference between dates over all comparisons.
summary(combined_meta$timediff)

# Overlap Difference
# Active reuse: every row except the last one of each table, columns 1-3.
# ICO reuse: the last row of each table, columns 1-3.
tech_last <- nrow(tech_table)
legal_last <- nrow(legal_table)
total_active_reuse <- sum(tech_table[-tech_last, 1:3]) +
  sum(legal_table[-legal_last, 1:3])
total_ICO_reuse <- sum(tech_table[tech_last, 1:3]) +
  sum(legal_table[legal_last, 1:3])
total_ICO_reuse/total_active_reuse


##########################################
# Barplot of ICO release dates.          #
##########################################

# Build a data frame of ICO coins with their first seen dates.
# Coin names are normalised the same way as the reuse tables (last four
# characters dropped, lower-cased) so they can be matched against them.
icocoins$Coin <- tolower(substr(icocoins$Coin, 1, nchar(icocoins$Coin) - 4))
setDT(icocoins)
setDT(firstseen)
# Inner join on Coin, keeping only the first matching firstseen row per coin.
icocoins <- firstseen[icocoins, mult = "first", on = "Coin", nomatch = 0L]

# Flag each ICO by the kind of reuse it appears in (as the copying document),
# and add the year of its first seen date. `%in%` is already vectorised, so no
# per-element lapply/ifelse is needed and an empty table yields logical(0).
icocoins$reuse <- icocoins$Coin %in% reused$In
icocoins$reuse_tech <- icocoins$Coin %in% tech_reuse$In
icocoins$reuse_legal <- icocoins$Coin %in% legal_reuse$In
icocoins$startyear <- format(as.Date(icocoins$Release), "%Y")
icocoins_reused <- icocoins[icocoins$reuse, ]
icocoins_reused_tech <- icocoins[icocoins$reuse_tech, ]
icocoins_reused_legal <- icocoins[icocoins$reuse_legal, ]

# Stack the series for the barplot. The combined-reuse subset is labelled but
# NOT stacked: only "All", tech and legal are drawn, matching the three legend
# entries below.
forbarplot <- icocoins
forbarplot$cat <- 'All'
icocoins_reused$cat <- 'Combined Reuse Found'
icocoins_reused_tech$cat <- 'Tech Reuse Found'
icocoins_reused_legal$cat <- 'Legal Reuse Found'
forbarplot <- rbind(forbarplot, icocoins_reused_tech, icocoins_reused_legal)

pdf("ICOBarplotStartDate.pdf", width = 9, height = 5)
par(mar = c(4, 4, 2, 1))
# table() sorts the category labels, so the bars within each year come out as
# All, Legal Reuse Found, Tech Reuse Found -- the order the colours and the
# legend labels below are given in.
barplot(table(forbarplot$cat, forbarplot$startyear),
        main = "ICO Start Date Distribution Over Time", beside = TRUE,
        xlab = "Year", col = c(c_blue, c_black, c_red), log = 'y',
        cex.lab = 1.5, cex.axis = 1.4, cex.names = 1.4, cex.main = 1.5)
legend("topleft", c("All", "Moderate Sim.", "High Sim."),
       col = c(c_blue, c_black, c_red), pch = 15, cex = 1.2)
# legend("topleft", c("All", "Combined Reuse", "Legal Reuse", "Tech Reuse"), col=c("darkgreen","lightblue","red","purple"), pch=15)
dev.off()




##########################################
# Benchy rating vs reuse ratings.        #
##########################################

# Add rating, benchy rating, and country to the icocoins.
# merge() keeps only coins present in both tables; the result is converted to
# a plain data.frame so that icocoins[rows, 'Rating'] returns a vector.
icocoins <- as.data.frame(merge(icocoins, icobench, by = 'Coin'))

# NOTE(review): "densityratings.pdf" is also the file name written by the
# later ICOBench-only density plot in this script, so a full run overwrites
# this figure -- confirm which version is the intended output.
pdf("densityratings.pdf", width = 7, height = 7)
par(mar = c(4, 4, 2, 1))

# The first plot() call fixes the axis limits for the whole figure. The
# overlays use lines(), which does not take xlim/ylim (passing them only
# raises "not a graphical parameter" warnings), so they are not repeated.

# Plot ICO High Sim Expert
plot(density(icocoins[icocoins$reuse_tech, 'Rating']),
     xlim = c(0.5, 5), ylim = c(0, 0.6),
     lwd = 2.5, main = "", xlab = "0-5 Rating",
     cex.lab = 1.25, cex.axis = 1.2, col = c_red)
# Plot ICO All Sim Expert
lines(density(icocoins$Rating), col = c_blue, lwd = 2.5)
# Plot ICO High Sim Benchy
lines(density(icocoins[icocoins$reuse_tech, 'BenchyRating']),
      lwd = 2.5, lty = 3, col = c_red)
# Plot ICO All Sim Benchy
lines(density(icocoins$BenchyRating), col = c_blue, lwd = 2.5, lty = 3)

legend("topleft",
       c("High Sim. Expert Rating", "All Expert Rating",
         "High Sim. Benchy Rating", "All Benchy Rating"),
       col = c(c_red, c_blue, c_red, c_blue), lwd = 2, lty = c(1, 1, 3, 3),
       cex = 1.1)
dev.off()

# Info for Wilcox Test

summary(icocoins$Rating) # All Expert Ratings
summary(icocoins$BenchyRating) # All Benchy Ratings
summary(icocoins[icocoins$reuse_tech, 'Rating']) # Reuse Expert Ratings
summary(icocoins[icocoins$reuse_tech, 'BenchyRating']) # Reuse Benchy Ratings


# Two-sample Wilcoxon tests: all rated ICOs against the tech-reuse subset.
# NOTE(review): the subset is also contained in the "all" sample -- confirm
# that this comparison is the intended one.
wilcox.test(icocoins$Rating, icocoins[icocoins$reuse_tech, 'Rating'])
wilcox.test(icocoins$BenchyRating, icocoins[icocoins$reuse_tech, 'BenchyRating'])

# Recorded output of the summaries above:
# summary(icocoins$Rating)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.000   2.700   3.200   3.199   3.800   4.700 
# summary(icocoins$BenchyRating)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.100   2.700   3.000   3.142   3.700   5.000 
# summary(icocoins[icocoins$reuse_tech,'Rating'])
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.300   2.900   3.300   3.329   3.900   4.700 
# summary(icocoins[icocoins$reuse_tech,'BenchyRating'])
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.300   2.800   3.200   3.233   3.800   5.000 



##########################################
# Minimum Spanning Tree of data.         #
##########################################

# Build the distance matrix for N2.
# Columns 1, 4 and 7 of tech_reuse are used as From, In and N2 (the names the
# pivot below relies on). Every pair is mirrored (In -> From) so the resulting
# matrix has the same documents on both axes.
n2_only <- tech_reuse[, c(1, 4, 7)]
extra <- n2_only[, c(2, 1, 3)]
names(extra) <- names(n2_only)
n2_only <- rbind(n2_only, extra)


# Info on pivot_wider: https://tidyr.tidyverse.org/reference/pivot_wider.html
# Duplicate pairs are averaged; pairs never observed get similarity 0.
edge_mat <- n2_only %>%
  pivot_wider(id_cols = From, names_from = In, values_from = N2,
              values_fn = mean, values_fill = 0) %>%
  column_to_rownames(var = "From")
edge_mat <- edge_mat[order(row.names(edge_mat)), order(names(edge_mat))]
# Turn similarity into distance: 1 means "no relation".
edge_mat <- 1 - edge_mat
edge_mat

# The adjacency constructor needs a matrix, not a data frame.
g <- graph_from_adjacency_matrix(
  as.matrix(edge_mat),
  mode = "undirected",
  weighted = TRUE, # We use the text reuse metric as the weight.
  diag = FALSE
)

# Remove multiples and loops.
g <- simplify(g, remove.multiple = TRUE, remove.loops = TRUE)

# Remove edges with weight equal to 1 (aka, no relation). The comparison is
# exact on purpose: these weights come from 1 - 0 with the fill value 0.
g <- delete_edges(g, E(g)[which(E(g)$weight == 1)])

# Remove nodes with no edges.
g <- delete_vertices(g, V(g)[degree(g) == 0])

# I got this example from https://www.biostars.org/p/285296/,
# but I would like to scale the nodes based on rank.

# Remove connected components that contain at most two vertices.
cl <- components(g)
small.clusters <- which(cl$csize <= 2)
vertices.to.delete <- which(cl$membership %in% small.clusters)
g <- delete_vertices(g, vertices.to.delete)
# NOTE(review): this sets a vertex attribute literally named
# "vertex.frame.color"; igraph's plot reads the attribute "frame.color", so
# this line probably has no visual effect. Left unchanged so the figure does
# not change -- confirm the intent.
V(g)$vertex.frame.color <- "white"

# Create minimum spanning tree (uses the edge weights set above).
mst <- mst(g, algorithm = "prim")

# Per-vertex value of column 2 of tech_reuse, taken from the first row whose
# From matches the vertex name (NA when the vertex never appears as From).
# Presumably column 2 is the source rank -- TODO confirm.
vertex_names <- V(mst)$name
vertex_rank <- tech_reuse[[2]][match(vertex_names, tech_reuse$From)]

# Node size shrinks as the rank value grows: 10 at rank 0, 5 at rank 800.
# Vertices without a rank get a practically invisible node.
node_size <- (1 / ((vertex_rank / 800) + 1)) * 10
node_size[is.na(node_size)] <- 0.00001

# Only label vertices whose rank value is at most 300.
labels <- ifelse(vertex_rank > 300, NA, vertex_names)

# Identify communities based on weights.
mst.communities <- cluster_edge_betweenness(mst, directed = FALSE)
mst.clustering <- make_clusters(mst, membership = mst.communities$membership)
V(mst)$color <- mst.communities$membership + 1

# Interactive view of the full (pre-MST) graph.
tkid <- tkplot(g)

# Plots are non-deterministic based on layout. Here are layouts:
# https://igraph.org/c/doc/igraph-Layout.html

plot(
  mst.communities, mst,
  # layout=l,
  edge.curved = FALSE,
  edge.width = 1.7,
  vertex.size = node_size,
  vertex.label.dist = .1,
  vertex.label.color = "black",
  # vertex.label=labels,
  vertex.label = NA,
  asp = FALSE,
  vertex.label.cex = 1,
  edge.arrow.mode = 0,
  main = "High Similarity Communities")
# Hand-placed annotations; the coordinates only fit one particular layout.
text(c(0.2, 1.0, -0.85), c(0.29, -0.5, -0.85), c("Bitcoin", "Monero", "Cryptonex"))
text(c(-0.37, 0.2, -0.70), c(-0.3, 0.8, 0.80), c("Heat-Ledger", "Theta", "Cryptoinvest"))
text(-0.6, 0, "Sealchain")


# Manually move nodes if needed.
# On the first run no saved layout `l` exists yet, so the window is opened
# with the default layout; on later runs the previously saved coordinates are
# reused as the starting point.
tkid <- if (exists("l", mode = "numeric")) {
  tkplot(mst, layout = l)
} else {
  tkplot(mst)
}

# Read back the (possibly hand-adjusted) coordinates, then close the window.
l <- tk_coords(tkid)

tk_close(tkid, window.close = TRUE)



##########################################
# Basic stats about data.                #
##########################################

# Total ICO coins scraped: 2038

# How many ICO's had significant reuse?
n_ico_reuse <- length(unique(icocoins_reused$Coin))
n_ico_reuse

# Percent of total:
n_ico_reuse / 2038

# How many ICO's had significant TECHNICAL reuse?
n_ico_reuse_tech <- length(unique(icocoins_reused_tech$Coin))
n_ico_reuse_tech

# Percent of total
n_ico_reuse_tech / 2038


# Total CMC coins scraped: 1265

# How many CMC's had significant reuse?
# A pair counts as CMC when the copying document has a rank.
reused_cmc <- reused[!is.na(reused$In_Rank), ]
n_cmc_reuse <- length(unique(reused_cmc$In))
n_cmc_reuse

# Percent of total
n_cmc_reuse / 1265

# How many CMC's had significant TECHNICAL reuse?
reused_cmc_tech <- reused[!is.na(reused$In_Rank) & reused$isTechnical, ]
n_cmc_reuse_tech <- length(unique(reused_cmc_tech$In))
n_cmc_reuse_tech

# Percent of total
n_cmc_reuse_tech / 1265


# Ten-point percentage bins. cut(..., right = TRUE) builds the intervals
# (0,10], (10,20], ..., (90,100], so an exact 0 would fall outside every bin.
breakpoints <- seq(0, 100, 10)
break_lables <- c('0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70',
                  '71-80', '81-90', '91-100')

# Force a similarity score into (0, 1]: exact zeros are nudged to 1e-6 so they
# land in the first bin, values above 1 are capped at 1. NA scores stay NA.
clamp_unit <- function(score) {
  score[which(score == 0)] <- 0.000001
  score[which(score > 1)] <- 1
  score
}

# Only the four similarity columns are clamped. Applying the comparisons to
# the whole table would also rewrite the rank, date and name columns (and can
# fail on columns that contain NA).
without_zero <- combined_meta
without_zero$N2 <- clamp_unit(without_zero$N2)
without_zero$N3 <- clamp_unit(without_zero$N3)
without_zero$N4 <- clamp_unit(without_zero$N4)
without_zero$TFIDF <- clamp_unit(without_zero$TFIDF)

# Bin a clamped score (as a percentage) into the ten labelled intervals.
bin_percent <- function(score) {
  cut(score * 100, breaks = breakpoints, labels = break_lables, right = TRUE)
}

all_reuse <- data.frame(N2 = bin_percent(without_zero$N2))
all_reuse$N3 <- bin_percent(without_zero$N3)
all_reuse$N4 <- bin_percent(without_zero$N4)
all_reuse$TFIDF <- bin_percent(without_zero$TFIDF)
summary(all_reuse, maxsum = 15)

# Summary Stats Table
# Number of distinct values in a vector.
count_unique <- function(values) {
  length(unique(values))
}

# Only pairs whose source document has a rank are considered.
high_sim <- tech_reuse[!is.na(tech_reuse$From_Rank), ]
mod_sim <- legal_reuse[!is.na(legal_reuse$From_Rank), ]

# Identify Coin Copiers/Token Copiers
# Coin copier: the copying document has a rank. Token copier: it has no rank
# and its name does not also occur among the coin copiers.
high_sim_coin <- high_sim[!is.na(high_sim$In_Rank), ]
high_token_rows <- which(
  is.na(high_sim$In_Rank) & !(high_sim$In %in% high_sim_coin$In)
)
high_sim_token <- high_sim[high_token_rows, ]

mod_sim_coin <- mod_sim[!is.na(mod_sim$In_Rank), ]
mod_token_rows <- which(
  is.na(mod_sim$In_Rank) & !(mod_sim$In %in% mod_sim_coin$In)
)
mod_sim_token <- mod_sim[mod_token_rows, ]

# Identify Impersonated Count (distinct sources / distinct copiers per group)
count_unique(high_sim_coin$From)
count_unique(high_sim_coin$In)
count_unique(high_sim_token$From)
count_unique(high_sim_token$In)

count_unique(mod_sim_coin$From)
count_unique(mod_sim_coin$In)
count_unique(mod_sim_token$From)
count_unique(mod_sim_token$In)

# Total Pairs
nrow(high_sim_coin)
nrow(high_sim_token)
nrow(mod_sim_coin)
nrow(mod_sim_token)

# Total Unique High (copiers)
count_unique(high_sim$In)

# Total Unique Mod (copiers)
count_unique(mod_sim$In)

# Total Unique High (sources)
count_unique(high_sim$From)

# Total Unique Mod (sources)
count_unique(mod_sim$From)

# NOTE(review): earlier in this script the scraped totals are quoted as 1265
# CMC coins and 2038 ICOs; the denominators here are 1260 and 2039 -- confirm
# which pair of numbers is correct.
total_coins <- 1260
total_ICOs <- 2039

# Percentage of COINS involved in high text reuse.
count_unique(high_sim_coin$In) / total_coins

# Percentage of COINS involved in mod text reuse.
count_unique(mod_sim_coin$In) / total_coins

# Percentage of TOKENS involved in high text reuse.
count_unique(high_sim_token$In) / total_ICOs

# Percentage of TOKENS involved in mod text reuse.
count_unique(mod_sim_token$In) / total_ICOs


# Flag every ICOBench entry whose coin name occurs among the high-similarity
# token copiers. `%in%` is vectorised, so no per-element lapply is needed.
tech_icos <- unique(high_sim_token$In)
icobench$Impersonated <- icobench$Coin %in% tech_icos
icobench_high_sim <- icobench[icobench$Impersonated, ]

# NOTE(review): "densityratings.pdf" is the same file name used by the earlier
# density plot built from the merged icocoins table, so this call overwrites
# that figure -- confirm that this is intended.
pdf("densityratings.pdf", width = 7, height = 7)
par(mar = c(4, 4, 2, 1))

# The first plot() call fixes the axis limits for the whole figure. The
# overlays use lines(), which does not take xlim/ylim (passing them only
# raises "not a graphical parameter" warnings), so they are not repeated.

# Plot ICO High Sim Expert
plot(density(icobench_high_sim$Rating), xlim = c(0.5, 5), ylim = c(0, 0.6),
     lwd = 2.5, main = "", xlab = "0-5 Rating",
     cex.lab = 1.25, cex.axis = 1.2, col = c_red)
# Plot ICO All Sim Expert
lines(density(icobench$Rating), col = c_blue, lwd = 2.5)
# Plot ICO High Sim Benchy
lines(density(icobench_high_sim$BenchyRating), lwd = 2.5, lty = 3, col = c_red)
# Plot ICO All Sim Benchy
lines(density(icobench$BenchyRating), col = c_blue, lwd = 2.5, lty = 3)

legend("topleft",
       c("High Sim. Expert Rating", "All Expert Rating",
         "High Sim. Benchy Rating", "All Benchy Rating"),
       col = c(c_red, c_blue, c_red, c_blue), lwd = 2, lty = c(1, 1, 3, 3),
       cex = 1.1)
dev.off()

# Five-number summaries: all ICOBench entries, then the flagged subset.
summary(icobench$Rating)
summary(icobench$BenchyRating)
summary(icobench_high_sim$Rating)
summary(icobench_high_sim$BenchyRating)


# Two-sample Wilcoxon tests: all ICOBench entries against the flagged subset.
wilcox.test(icobench$Rating, icobench_high_sim$Rating)
wilcox.test(icobench$BenchyRating, icobench_high_sim$BenchyRating)

##########################################
# Heatmap of the overlaps.               #
##########################################


# Read the per-location overlap values and reshape them into long format:
# one row per (comparison, location) with the reuse value at that location.
heat_data <- read.csv('Locations_Over_Threshold.csv', stringsAsFactors = F)
# Locations arrives as a bracketed, comma-separated string; strip the square
# brackets and split it, leaving a list column of character vectors.
heat_data$Locations <- strsplit(gsub("\\[|\\]", '', heat_data$Locations), split = ", ")

# Label every comparison as "<From>-<In>".
# NOTE(review): apply() converts the data frame to a matrix first; x$From only
# works because each row is handed over as a list -- confirm before changing
# the column types of heat_data.
heat_data2 <- data.frame(Comparison=apply(heat_data, 1, FUN = function(x) 
        paste(x$From, '-', x$In, sep='')))
# One row per comparison, one column per location; columns are named 1..n.
# rbind() only produces a clean matrix if every row has the same number of
# locations.
x <- do.call(rbind, heat_data$Locations)
colnames(x) <- 1:ncol(x)
heat_data2 <- cbind(heat_data2["Comparison"], x)
# Category from columns 7 and 8 of heat_data: column 8 is treated as the
# technical flag and column 7 as the legal flag (per the labels assigned).
# Only 8 -> 'Tech', only 7 -> 'Legal', both -> 'Both', otherwise 'None'.
heat_data2$Category <- apply(heat_data, 1, FUN = function(x) ifelse(x[8] == "TRUE" & x[7] == "FALSE", 'Tech', 
                                                                    ifelse(x[8] == "TRUE" & x[7] == "TRUE", 'Both',
                                                                           ifelse(x[8] == "FALSE" & x[7] == "TRUE", 'Legal', 'None'))))
# Wide -> long. measure.vars is hard-coded to the columns named "1".."100",
# i.e. this assumes 100 location values per comparison -- TODO confirm.
heat_data2 <- melt(data = heat_data2, id.vars = c("Comparison", "Category"),
                     measure.vars = as.character(1:100))
# The values were split from strings, so convert them to numbers.
heat_data2$Reuse <- as.numeric(heat_data2$value)
# as.numeric() on the melted `variable` column; presumably a factor whose
# level codes equal the column labels 1..100 -- verify.
heat_data2$Percent_Location <- as.numeric(heat_data2$variable)
heat_data2$Scaled_Reuse <- heat_data2$Reuse*100

# Build the reuse heatmap for one Category of heat_data2.
#   category_label: value of heat_data2$Category to keep
#                   ("Tech", "Legal" or "Both").
#   plot_title:     title printed above the heatmap.
# Returns the ggplot object: one tile per (comparison, location), filled by
# Scaled_Reuse, with the location axis reversed so location 1 is at the top.
reuse_heatmap <- function(category_label, plot_title) {
  ggplot(heat_data2[heat_data2$Category == category_label, ],
         aes(Comparison, Percent_Location, fill = Scaled_Reuse)) +
    geom_tile(color = "white", size = 0.1) +
    # scale_fill_gradient(low = muted(c_green), high = c_red)
    scale_fill_viridis(name = "Reuse Amount (%)", discrete = FALSE,
                       limits = c(0, 1.07)) +
    scale_y_continuous(trans = "reverse", breaks = c(1, 25, 50, 75, 100)) +
    theme_minimal(base_size = 8) +
    labs(title = plot_title, x = "Paper", y = "Location Within Paper (%)") +
    theme(
      legend.position = "bottom",
      legend.key.size = unit(1, 'cm'),
      legend.title = element_text(size = 15),
      legend.text = element_text(size = 12),
      plot.title = element_text(size = 30, hjust = 0.5),
      strip.background = element_rect(colour = "white"),
      axis.ticks = element_blank(),
      axis.text = element_text(size = 20),
      axis.text.y = element_text(size = 15),
      # One column per comparison: the individual names are not shown.
      axis.text.x = element_blank(),
      axis.title = element_text(size = 15)
    ) +
    removeGrid() # ggExtra
}

# Each plot is print()ed explicitly. A bare object name is only auto-printed
# at top level of an interactive or Rscript session; when the script is run
# through source() nothing would be drawn and the PDF would stay empty.

pdf('HEAT-tech.pdf')
tech_map <- reuse_heatmap("Tech", "Technical Only")
print(tech_map)
dev.off()

pdf('HEAT-legal.pdf')
legal_map <- reuse_heatmap("Legal", "Legal Only")
print(legal_map)
dev.off()

pdf('HEAT-both.pdf')
both_map <- reuse_heatmap("Both", "Technical + Legal")
print(both_map)
dev.off()


# Heatmap over every comparison in heat_data2 (no Category filter), filled by
# the unscaled Reuse value and drawn on the current device.
# NOTE(review): the title says "Legal Text Reuse" although all categories are
# plotted -- confirm the title.
none_map <- ggplot(
  heat_data2,
  aes(x = Comparison, y = Percent_Location, fill = Reuse)
) +
  geom_tile(color = "white", size = 0.1) +
  scale_fill_viridis(name = "Overlap Amount (%) Normalized", discrete = FALSE) +
  scale_y_continuous(trans = "reverse", breaks = c(1, 25, 50, 75, 100)) +
  theme_minimal(base_size = 8) +
  labs(
    title = "Legal Text Reuse",
    x = "Paper",
    y = "Location Within Paper (%)"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    plot.title = element_text(size = 14, hjust = 0),
    strip.background = element_rect(colour = "white"),
    axis.ticks = element_blank(),
    axis.text = element_text(size = 7),
    axis.text.y = element_text(size = 6),
    axis.text.x = element_blank()
  ) +
  removeGrid() # ggExtra
none_map





# How often each copying document appears in the technical / legal pairs.
tech_unique <- as.data.frame(table(tech_reuse$In))
legal_unique <- as.data.frame(table(legal_reuse$In))



# Densities of the four similarity scores (as percentages), each restricted to
# the comparisons above a per-method threshold. Columns 7-10 of combined_meta
# are read for N2, N3, N4 and TFIDF respectively. The N2 curve opens the plot
# on a log x axis; the other three are overlaid.
plot(
  density(unlist(combined_meta[combined_meta$N2 > 0.2, 7]) * 100),
  log = 'x',
  xlim = c(1, 100)
)
n3_percent <- unlist(combined_meta[combined_meta$N3 > 0.01, 8]) * 100
lines(density(n3_percent))
n4_percent <- unlist(combined_meta[combined_meta$N4 > 0.05, 9]) * 100
lines(density(n4_percent))
tfidf_percent <- unlist(combined_meta[combined_meta$TFIDF > 0.1, 10]) * 100
lines(density(tfidf_percent))
